Input:




Get list of genes and list of pubmed ids

# Select if results should be filtered to ASD related or not
filter_ASD = FALSE

### Get list of genes and pubmed IDs

# Genes
list_gene_files = list.files('./../SFARI_scraping/genes/')

# Pubmed IDs
get_pubmed_IDs = function(autism=FALSE){
  all_pubmed_ids = c()
  n = 0
  
  for(gene_file in list_gene_files){
    file = read.delim(paste0('./../SFARI_scraping/genes/', gene_file))
    
    if(autism) file = file %>% filter(Autism.Report == 'Yes')
    
    if(nrow(file)>0){
      pubmed_ids = unique(file$pubmed.ID)
      all_pubmed_ids = unique(c(all_pubmed_ids, pubmed_ids)) 
    }
    
    n = c(n, length(all_pubmed_ids))
  }
  
  all_pubmed_ids = as.character(all_pubmed_ids)
  
  return(list('n'=n, 'all_pubmed_ids'=all_pubmed_ids))
}

get_pubmed_IDs_output = get_pubmed_IDs(filter_ASD)
n = get_pubmed_IDs_output$n
all_pubmed_ids = get_pubmed_IDs_output$all_pubmed_ids
# See change in number of pubmed articles by each new gene (Seems like the number of new articles were beginning to decrease in the end)
pubmed_counts = data.frame('genes' = seq(1,length(n)), 'articles'=n)

ggplotly(pubmed_counts %>% ggplot(aes(genes, articles,color='#434343')) + geom_line() +
  geom_smooth(method='lm', color='#666666', size=0.5) + theme_minimal() + theme(legend.position='none') +
  ggtitle('Increase in number of pubmed articles by each new gene'))
rm(n, get_pubmed_IDs_output, pubmed_counts)

Create gene-pubmed ID (sparse) matrix

# Create gene - pubmedID sparse matrix
SFARI_genes_info = read.csv('./../../Data/SFARI_genes_01-15-2019.csv') %>% mutate('ID'=gene.symbol)

create_gene_pmID_mat = function(autism=FALSE){
  
  # Create empty dataframe
  gene_pmID_mat = data.frame(matrix(0, nrow=length(list_gene_files), ncol=length(all_pubmed_ids)))
  rownames(gene_pmID_mat) = sapply(list_gene_files, function(x) gsub('.csv', '', x))
  colnames(gene_pmID_mat) = all_pubmed_ids
  
  # Fillout dataframe
  for(gene_file in list_gene_files){
    file = read.delim(paste0('./../SFARI_scraping/genes/', gene_file))
    
    if(autism) file = file %>% filter(Autism.Report == 'Yes')
    
    if(nrow(file)>0){
      pubmed_ids = as.character(unique(file$pubmed.ID))
      gene_pmID_mat[gsub('.csv', '', gene_file), pubmed_ids] = 1 
    }
  }
  
  gene_names = rownames(gene_pmID_mat)
  sparse_gene_pmID_mat = Matrix(as.matrix(gene_pmID_mat), sparse=TRUE)
    
  return(list('gene_names'=gene_names, 'sparse_gene_pmID_mat'=sparse_gene_pmID_mat))
}

gene_pmID_mat_output = create_gene_pmID_mat(filter_ASD)
gene_names = gene_pmID_mat_output$gene_names
gene_pmID_mat = gene_pmID_mat_output$sparse_gene_pmID_mat

plot_data = data.frame('ID' = rownames(gene_pmID_mat), 'n_papers'=rowSums(gene_pmID_mat)) %>% 
            left_join(SFARI_genes_info, by='ID') %>% mutate(gene.score=as.factor(gene.score))

bins = max(plot_data$n_papers)-min(plot_data$n_papers)+1
p1 = ggplotly(plot_data %>% ggplot(aes(n_papers)) + geom_histogram(bins=bins, alpha=0.6) +
         theme_minimal() + ggtitle('Distribution of number of papers per gene'))

p2 = ggplotly(plot_data %>% ggplot(aes(gene.score, n_papers, fill=gene.score)) + geom_boxplot() + 
              scale_fill_manual(values=SFARI_colour_hue()) + theme_minimal() + 
              ggtitle('Number of papers per gene (left) and by SFARI score (right)'))

subplot(p1, p2, nrows=1)
rm(p1, p2, bins, plot_data, gene_pmID_mat_output, list_gene_files)
# Remove genes with no articles related to them
if(sum(rowSums(gene_pmID_mat)==0)>0){
  print(paste0('Removing ', sum(rowSums(gene_pmID_mat)==0), ' rows with zero counts from gene-pubmedID count matrix'))
  gene_pmID_mat = gene_pmID_mat[rowSums(gene_pmID_mat)>0,] 
}


Network analysis

Create node and edge list for Network

# EDGES
gene_gene_mat = gene_pmID_mat %*% t(gene_pmID_mat)

edges = summary(gene_gene_mat) %>% rename('from'=i, 'to'=j, 'count'=x) %>% 
        mutate(from = gene_names[from], to = gene_names[to]) %>%
        filter(from!=to)

# Convert count to fraction and correct numbers to make them bigger (proportions are too small)
gene_pmID_mat_rowsums = rowSums(gene_pmID_mat)
names(gene_pmID_mat_rowsums) = rownames(gene_pmID_mat)
edges = edges %>% mutate('weight'=count/(gene_pmID_mat_rowsums[from]+gene_pmID_mat_rowsums[to])*100)

# NODES
nodes = SFARI_genes_info %>% dplyr::select(gene.symbol, gene.score, syndromic, number.of.reports) %>% 
        mutate(id = gene.symbol, title=gene.symbol, value=number.of.reports,
               shape=ifelse(syndromic==0, 'circle','square'),
               color_score=case_when(gene.score==1 ~ '#FF7631', gene.score==2 ~ '#FFB100',
                                     gene.score==3 ~ '#E8E328', gene.score==4 ~ '#8CC83F',
                                     gene.score==5 ~ '#62CCA6', gene.score==6 ~ '#59B9C9',
                                     is.na(gene.score) ~ '#b3b3b3')) %>%
        mutate('color'=color_score) %>% filter(!duplicated(id)) %>% 
        filter(gene.symbol %in% unique(c(edges$from, edges$to)))

# Details:
print(paste0('Nodes: ', nrow(nodes),'        Edges: ', nrow(edges)/2))
## [1] "Nodes: 955        Edges: 83966"
print(paste0('Lost ', length(unique(SFARI_genes_info$gene.symbol[!SFARI_genes_info$gene.symbol %in% nodes$id])),
             ' genes because they didn\'t share any publication with other genes'))
## [1] "Lost 98 genes because they didn't share any publication with other genes"
rm(gene_pmID_mat_rowsums)

Scores of the lost genes:

Lost more genes from lower scores, not a single one from score 1

table(SFARI_genes_info$gene.score[!SFARI_genes_info$gene.symbol %in% nodes$gene.symbol], useNA='ifany')
## 
##    2    3    4    5 <NA> 
##    1   10   33   40   15

(Reference) count of original genes by score:

table(SFARI_genes_info$gene.score, useNA='ifany')
## 
##    1    2    3    4    5    6 <NA> 
##   25   62  195  454  175   25  118
rm(gene_gene_mat)
edges %>% ggplot(aes(count, weight)) + geom_point(alpha=0.1, color='#006666', position='jitter') + ylab('proportion') +
          ggtitle('Effect of change from Count to Proportion on edge weights') + theme_minimal()

graph = graph_from_data_frame(edges, vertices=nodes, directed=FALSE)
graph = graph %>% simplify(edge.attr.comb='first')

Eigencentrality

The correlation between eigencentrality and SFARI scores is not as strong as before

# Eigencentrality
eigen_centrality_output = eigen_centrality(graph)
eigencentr = as.numeric(eigen_centrality_output$vector)
graph = graph %>% set_vertex_attr('eigencentrality', value=eigencentr)

plot_data = data.frame('gene.score'= as.factor(vertex_attr(graph, 'gene.score')),
                       'Syndromic'= as.logical(vertex_attr(graph, 'syndromic')),
                       'Eigencentrality'= vertex_attr(graph, 'eigencentrality'))

correlation = cor(nodes$gene.score[!is.na(nodes$gene.score)], eigencentr[!is.na(nodes$gene.score)])
ggplotly(plot_data %>% ggplot(aes(gene.score, Eigencentrality, fill=gene.score)) + geom_boxplot() + 
         scale_fill_manual(values=SFARI_colour_hue()) + theme_minimal() + theme(legend.position='none') +
         ggtitle(paste0('Correlation = ', round(correlation, 4))))
correlation = cor(as.numeric(plot_data$Syndromic), plot_data$Eigencentrality)
ggplotly(plot_data %>% ggplot(aes(Syndromic, Eigencentrality, fill=Syndromic)) + geom_boxplot() + 
     theme_minimal() + ggtitle(paste0('Correlation = ', round(correlation, 4))))
rm(correlation, plot_data, eigen_centrality_output, eigencentr)

The relation between number of papers and eigencentrality is negative for all groups except score 5

plot_data = data.frame('id'=vertex_attr(graph, 'id'), 
                       'eigencentrality'=vertex_attr(graph, 'eigencentrality'), 
                       'n.papers'=vertex_attr(graph, 'value'),
                       'gene.score'=as.factor(vertex_attr(graph, 'gene.score')))

ggplotly(plot_data %>% ggplot(aes(n.papers, eigencentrality, fill=gene.score, color=gene.score)) + 
         geom_point(alpha=0.5, aes(id=id)) + geom_smooth(method='lm', fill=NA, size=0.5) + 
         theme_minimal() + ggtitle('N papers vs Eigencentrality'))
rm(plot_data)

Graph coloured by SFARI score

visIgraph(graph) %>% visOptions(selectedBy='gene.score', highlightNearest=TRUE) %>% 
                     visIgraphLayout(randomSeed=123)

Community detection:

communities = cluster_louvain(graph)
modules = communities %>% membership

color_pal = viridis_pal()(max(modules)) %>% substr(1,7)
graph = graph %>% set_vertex_attr('module', value=as.numeric(modules)) %>% 
                  set_vertex_attr('color_modules', value=color_pal[as.numeric(modules)]) %>%
                  set_vertex_attr('color', value=color_pal[as.numeric(modules)])

table(modules)
## modules
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18 
##   2   2   3   3  21 167   2   7  40 241   2   2  50   2 162 243   2   2 
##  19 
##   2
hm_data = table(vertex_attr(graph, 'gene.score'), vertex_attr(graph, 'module'), useNA='ifany') %>% 
          as.data.frame.matrix %>% as.matrix

heatmap.2(hm_data, cellnote=hm_data, notecol='white', Rowv=F, Colv=F, scale='row', trace='none', 
          key=FALSE, xlab='Module', ylab='SFARI score', col=colorRampPalette(brewer.pal(8, 'RdPu'))(25))

rm(communities, moduless, color_pal)
visIgraph(graph) %>% visOptions(selectedBy='module', highlightNearest=TRUE) %>% 
                     visIgraphLayout(randomSeed=123)